from time import sleep
import requests
import parsel
from urllib import request
import pandas as pd
import os


def getsuffix(url):
    """Return the file extension of *url*: the text after the last dot.

    If *url* contains no dot, the whole string is returned (same as the
    original split-based behavior).
    """
    _, _, ext = url.rpartition(".")
    return ext


def getDoubantop250(page=1):
    """Scrape one page of Douban's Top-250 movie chart.

    Parameters
    ----------
    page : int
        1-based page number; each page lists 25 movies.

    Returns
    -------
    pandas.DataFrame
        One row per movie, columns: '电影名', '导演', '短评', '评分', '剧照'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-2xx status.
    """
    start = 25 * (page - 1)
    url = "https://movie.douban.com/top250?start={start}&filter=".format(
        start=start)
    # Browser-like User-Agent so the site does not reject the request.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.42"
    }
    myresponse = requests.get(url=url, headers=header)
    # Fail loudly on HTTP errors instead of silently parsing an error page
    # into an empty DataFrame.
    myresponse.raise_for_status()
    # Parse the HTML and select the 25 <li> entries of the chart.
    html = parsel.Selector(myresponse.text)
    datas = html.xpath("//ol[@class=\"grid_view\"]/li")

    table_header = ['电影名', '导演', '短评', '评分', '剧照']
    movies_data = []

    for data in datas:
        # Every .get() may return None when a node is missing (the original
        # only guarded the quote); default to "" so .strip() cannot crash.
        movie_title = data.xpath(
            'div/div[2]/div[@class="hd"]/a/span[1]/text()').get() or ""
        movie_daoyan = data.xpath(
            'div/div[2]/div[@class="bd"]/p[1]/text()').get() or ""
        movie_quote = data.xpath(
            'div/div[2]/div[@class="bd"]/p[2]/span/text()').get() or ""
        movie_stars = data.xpath(
            'div/div[2]/div[@class="bd"]/div[1]/span[2]/text()').get()
        movie_pic = data.xpath('div/div[1]/a/img/@src').get() or ""
        movies_data.append({
            table_header[0]: movie_title.strip(),
            table_header[1]: movie_daoyan.strip(),
            table_header[2]: movie_quote.strip(),
            # A missing rating becomes NaN instead of raising TypeError.
            table_header[3]: float(movie_stars.strip()) if movie_stars else float("nan"),
            table_header[4]: movie_pic.strip(),
        })
    df = pd.DataFrame(data=movies_data, columns=table_header)
    return df

# Output locations: the base directory must exist up front; the image
# sub-directory is created on demand.
context = './data'
img_context = f'{context}/img'
if os.path.isdir(context):
    os.makedirs(img_context, exist_ok=True)
else:
    raise FileNotFoundError(f"{context} not exist or not a dir")

# Crawl pages 1..10 of the chart and merge them into a single table.
frames = []
for page in range(1, 11):
    print(f"正在爬取第{page}页···")
    sleep(1.5)  # throttle requests to avoid triggering Douban's anti-scraping defenses
    frames.append(getDoubantop250(page))
movies_info = pd.concat(frames, ignore_index=True)
movies_info.to_excel(f"{context}/movies.xlsx", sheet_name="电影", index=False)

# Download each movie's still image, naming the file after the movie.
for idx, rows in movies_info.loc[:, ['电影名', '剧照']].iterrows():
    url = rows['剧照']
    # BUG FIX: the template previously contained a literal placeholder
    # instead of {filename}, so the `filename=` keyword was never consumed
    # and every still overwrote the same file on disk.
    filename = "{context}/{filename}.{suffix}".format(
        context=img_context, filename=rows['电影名'], suffix=getsuffix(url))
    request.urlretrieve(url=url, filename=filename)
    # Progress report plus a short pause after every 25 downloads to stay polite.
    if (idx + 1) % 25 == 0:
        print("已下载{:d}副剧照".format(idx + 1))
        sleep(1)
